# ---- Setup ----
# Load packages and read the cleaned EPA air-quality dataset.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# NOTE(review): loading plyr after dplyr masks dplyr's arrange/count/desc/
# mutate/summarise (see the startup message below). plyr's arrange() and
# desc() happen to behave the same for the calls in this script, but loading
# plyr first -- or qualifying calls with dplyr:: -- would be safer.
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
library(ggplot2)
library(ggmap)
library(mapdata)
## Loading required package: maps
## 
## Attaching package: 'maps'
## The following object is masked from 'package:plyr':
## 
##     ozone
library(maps)
library(stringr)
library(viridis)
## Loading required package: viridisLite
library(maptools)
## Loading required package: sp
## Checking rgeos availability: FALSE
##      Note: when rgeos is not available, polygon geometry     computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()
# NOTE(review): gpclib support in maptools is deprecated (see the warning
# from gpclibPermit() below); rgeos/sf are the modern replacements --
# confirm before re-running this notebook on a current R installation.
library(gpclib) 
## General Polygon Clipper Library for R (version 1.5-5)
##  Type 'class ? gpc.poly' for help
library(sp)
gpclibPermit()
## Warning in gpclibPermit(): support for gpclib will be withdrawn from
## maptools at the next major release
## [1] TRUE
# Read the cleaned dataset: header row present, comma-separated
# (equivalent to read.csv on the same file).
eda <- read.table("data/cleaned_AQ.csv", header = TRUE, sep = ",")

Proposition 1: We would like to figure out how the distribution of pollutants across various regions of the US varies by year.

Data Preparation: To meet the purpose of Proposition 1, let’s extract the subset of columns we need to focus on, and then free the memory held by the data frame we are no longer using.

# Keep only the columns needed for Proposition 1; everything else is dropped
# so the working data frame stays small.
eda1 <- eda[, c("latitude",
                "longitude",
                "parameter_name",
                "method_name",
                "year",
                "arithmetic_mean",
                "arithmetic_standard_dev",
                "state_name")]

# The full table is no longer needed; release its memory.
rm(eda) 
#filter(eda1, !is.na(method_name) | method_name != "")

Okay, so let’s check out the data across all 30 years separately. To stay within the project’s scope, we will look at the data for the years 1987 and 2017, which mark the beginning and the end of the available air-pollutant data respectively.

# filter(eda1,year == "1987") 

# Earliest year available: preview the 1987 records.
eda1 %>%
  filter(year == "1987") %>%
  head()
##   latitude  longitude              parameter_name
## 1 36.56689 -118.77732 Suspended particulate (TSP)
## 2 40.11222  -75.30917             Carbon monoxide
## 3 39.73643  -86.21332              Sulfur dioxide
## 4 41.61662  -87.14696       PM10 Total 0-10um STP
## 5 32.79119 -116.94209      Nitrogen dioxide (NO2)
## 6 39.31417  -76.61333                       Ozone
##                             method_name year arithmetic_mean
## 1        MEMBRANE-SAMPLER - GRAVIMETRIC 1987       25.518519
## 2 INSTRUMENTAL - NONDISPERSIVE INFRARED 1987        0.670625
## 3              Missing Data From Kaggle 1987       11.685389
## 4           HI-VOL-SA321A - GRAVIMETRIC 1987       38.757576
## 5      INSTRUMENTAL - CHEMILUMINESCENCE 1987       53.455056
## 6              Missing Data From Kaggle 1987        0.035872
##   arithmetic_standard_dev   state_name
## 1               21.079283   California
## 2                0.591866 Pennsylvania
## 3               13.279600      Indiana
## 4               24.645018      Indiana
## 5               26.144252   California
## 6                0.018500     Maryland
# filter(eda1,year == "1988")
# filter(eda1,year == "1989")
# filter(eda1,year == "1990")
# filter(eda1,year == "1991")
# filter(eda1,year == "1992")
# filter(eda1,year == "1993")
# filter(eda1,year == "1994")
# filter(eda1,year == "1995")
# filter(eda1,year == "1996")
# filter(eda1,year == "1997")
# filter(eda1,year == "1998")
# filter(eda1,year == "1999")
# filter(eda1,year == "2000")
# filter(eda1,year == "2001")
# filter(eda1,year == "2002")
# filter(eda1,year == "2003")
# filter(eda1,year == "2004")
# filter(eda1,year == "2005")
# filter(eda1,year == "2006")
# filter(eda1,year == "2007")
# filter(eda1,year == "2008")
# filter(eda1,year == "2009")
# filter(eda1,year == "2010")
# filter(eda1,year == "2011")
# filter(eda1,year == "2012")
# filter(eda1,year == "2013")
# filter(eda1,year == "2014")
# filter(eda1,year == "2015")
# filter(eda1,year == "2016")
# filter(eda1,year == "2017")
# Latest year available: preview the 2017 records.
eda1 %>%
  filter(year == "2017") %>%
  head()
##   latitude longitude           parameter_name
## 1 41.24749 -95.97314 PM2.5 - Local Conditions
## 2 39.56333 -76.20389   Wind Speed - Resultant
## 3 41.84104 -71.36097        Relative Humidity
## 4 41.59851 -87.34299 PM2.5 - Local Conditions
## 5 30.11030 -84.99030                    Ozone
## 6 40.96911 -95.04495                    Ozone
##                                                           method_name year
## 1             Met One BAM-1020 Mass Monitor w/VSCC - Beta Attenuation 2017
## 2                             INSTRUMENTAL - VECTOR SUMMATION LEVEL 4 2017
## 3                             INSTRUMENTAL - HYGROSCOPIC PLASTIC FILM 2017
## 4 R & P Model 2025 PM-2.5 Sequential Air Sampler w/VSCC - Gravimetric 2017
## 5                                         INSTRUMENTAL - ULTRA VIOLET 2017
## 6                                            Missing Data From Kaggle 2017
##   arithmetic_mean arithmetic_standard_dev   state_name
## 1       10.721395                4.680987     Nebraska
## 2        5.921050                3.231377     Maryland
## 3       71.152758               20.909690 Rhode Island
## 4        9.786667                5.223144      Indiana
## 5        0.041389                0.007580      Florida
## 6        0.037717                0.008269         Iowa
# Year 1987: sort records by pollutant mean so both extremes are easy to
# inspect.
eda1987 <- eda1 %>%
  filter(year == "1987") %>%
  arrange(desc(arithmetic_mean))

# Highest pollutant means first.
head(eda1987)
##   latitude longitude     parameter_name                     method_name
## 1 32.72727 -117.1545 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 2 36.72523 -119.7513 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 3 33.92899 -118.2107 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 4 34.28040 -119.3146 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 5 34.92442 -120.4018 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 6 37.33967 -121.8886 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
##   year arithmetic_mean arithmetic_standard_dev state_name
## 1 1987       20435.528               8998.2441 California
## 2 1987       17617.544              15073.3133 California
## 3 1987        3443.692               1858.0015 California
## 4 1987        2560.900               1264.9616 California
## 5 1987        2237.532                666.1922 California
## 6 1987        2196.192                522.5010 California
# Lowest pollutant means (ascending order).
head(arrange(eda1987, arithmetic_mean))
##   latitude  longitude         parameter_name
## 1 34.51165 -120.49989    Vertical Wind Speed
## 2 44.58591  -68.80476 Temperature Difference
## 3 41.71776  -86.90779 Temperature Difference
## 4 34.40277 -119.45845    Vertical Wind Speed
## 5 40.00889  -75.09778    Beryllium (TSP) STP
## 6 38.25257  -85.74802           Bromomethane
##                                                      method_name year
## 1                   Instrumental - Electronic or machine average 1987
## 2 Instrumental - Electronic or machine average level 2 - level 1 1987
## 3 Instrumental - Electronic or machine average level 2 - level 1 1987
## 4                   Instrumental - Electronic or machine average 1987
## 5                                 HI-VOL - EMISSION SPECTRA ICAP 1987
## 6               CANISTER SUBAMBIENT PRESSURE - MULTI DETECTOR GC 1987
##   arithmetic_mean arithmetic_standard_dev   state_name
## 1       -0.209663                0.766333   California
## 2       -0.179139                1.171996        Maine
## 3       -0.140885                0.906252      Indiana
## 4       -0.123121                0.209601   California
## 5        0.000000                0.000000 Pennsylvania
## 6        0.000000                0.000000     Kentucky
# Year 2017: sort records by pollutant mean so both extremes are easy to
# inspect.
eda2017 <- eda1 %>%
  filter(year == "2017") %>%
  arrange(desc(arithmetic_mean))

# Highest pollutant means first.
head(eda2017)
##   latitude  longitude              parameter_name
## 1 41.79734  -71.41793 Particle Number Total COunt
## 2 41.80778  -71.41510 Particle Number Total COunt
## 3 39.75118 -104.98762                     Methane
## 4 37.05822  -88.57251         Elapsed Sample Time
## 5 37.10258 -107.87022         Elapsed Sample Time
## 6 38.06503  -84.49761         Elapsed Sample Time
##                                                                                     method_name
## 1 T-API 651/TSI 3783 at 3.0 lpm and 0.6 um cutpoint - Water-Based Condensation particle counter
## 2 T-API 651/TSI 3783 at 3.0 lpm and 0.6 um cutpoint - Water-Based Condensation particle counter
## 3                                            6L SUBAMBIENT SS-CANISTER - PRECONCENTRATOR GC/FID
## 4                              R & P Model 2025 PM-2.5 FEM Sequential AIr Sampler - Calculation
## 5                                         R & P Model 2000 PM-2.5 FEM Air Sampler - Calculation
## 6                              R & P Model 2025 PM-2.5 FEM Sequential AIr Sampler - Calculation
##   year arithmetic_mean arithmetic_standard_dev   state_name
## 1 2017        26889.56              23587.4956 Rhode Island
## 2 2017        13367.08               7754.3927 Rhode Island
## 3 2017         3152.50                333.6382     Colorado
## 4 2017         1440.00                  0.0000     Kentucky
## 5 2017         1440.00                  0.0000     Colorado
## 6 2017         1440.00                  0.0000     Kentucky
# Lowest pollutant means (ascending order).
head(arrange(eda2017, arithmetic_mean))
##   latitude  longitude              parameter_name
## 1 39.33850 -120.17129 Average Ambient Temperature
## 2 43.66025  -70.26896 Average Ambient Temperature
## 3 42.68075  -73.75733     Ambient Min Temperature
## 4 44.39308  -73.85890 Average Ambient Temperature
## 5 42.49984  -96.39476     Ambient Min Temperature
## 6 40.81259  -96.68331     Ambient Min Temperature
##                                                       method_name year
## 1 R & P Model 2025 PM-2.5 FEM Sequential Air Sampler - Electronic 2017
## 2 R & P Model 2025 PM-2.5 FEM Sequential Air Sampler - Electronic 2017
## 3                     R & P Model 2025 PM2.5 Sequent - Electronic 2017
## 4                     R & P Model 2025 PM2.5 Sequent - Electronic 2017
## 5 R & P Model 2025 PM-2.5 FEM Sequential Air Sampler - Electronic 2017
## 6 R & P Model 2025 PM-2.5 FEM Sequential Air Sampler - Electronic 2017
##   arithmetic_mean arithmetic_standard_dev state_name
## 1       -6.133333                2.912044 California
## 2       -5.950000                3.717078      Maine
## 3       -5.309091                7.255957   New York
## 4       -4.875000                6.170562   New York
## 5       -4.286207                6.582592       Iowa
## 6       -3.886667                5.763787   Nebraska
# Creating Another Data Frame for comparing Mean Value of Pollutants Data Across the Year 1987 & 2017
eda2 <- eda1 %>% filter(year == "1987" | year == "2017") %>% arrange(desc(arithmetic_mean))
# Per-column frequency tables via plyr::count (dplyr's count is masked by
# plyr in this session). In the printed matrix, "factor,143" under
# parameter_name and "factor,114" under method_name are the numbers of
# distinct pollutants and distinct test methods recorded in 1987.
sapply(eda1987, count)
##      latitude     longitude    parameter_name method_name year
## x    Numeric,2556 Numeric,2558 factor,143     factor,114  1987
## freq Integer,2556 Integer,2558 Integer,143    Integer,114 6633
##      arithmetic_mean arithmetic_standard_dev state_name
## x    Numeric,6095    Numeric,6144            factor,54 
## freq Integer,6095    Integer,6144            Integer,54

So it appears there are 143 distinct pollutants listed, and 114 distinct test methods were used in 1987. Next we need to find out which tests and pollutant names are most common in that year.

Histogram of Arithmetic Mean of Pollutants Distribution Across Entire US for 30 Years

# Histogram of pollutant mean across the entire dataset (log10 x scale).
ggplot(eda1, aes(x = arithmetic_mean)) +
  geom_histogram(fill = "plum", color = "black") +
  scale_x_log10() +
  labs(x = "Pollutant Mean",                                          # x axis title
       y = "Actual Pollutants",                                       # y axis title
       title = "Distribution of pollutants across US for 30 years")   # main title of figure
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35945 rows containing non-finite values (stat_bin).

Histogram of Arithmetic Mean of Pollutants Distribution Across Entire US for the Year of 1987

# Histogram of pollutant mean for 1987 only (log10 x scale).
ggplot(eda1987, aes(x = arithmetic_mean)) +
  geom_histogram(fill = "steelblue", color = "black") +
  scale_x_log10() +
  labs(x = "Pollutant Mean",                                                # x axis title
       y = "Actual Pollutants",                                             # y axis title
       title = "Distribution of pollutants across US for the year of 1987") # main title of figure
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 317 rows containing non-finite values (stat_bin).

Histogram of Arithmetic Mean of Pollutants Distribution Across Entire US for the Year of 2017

# Histogram of pollutant mean for 2017 only (log10 x scale).
ggplot(eda2017, aes(x = arithmetic_mean)) +
  geom_histogram(fill = "salmon", color = "black") +
  scale_x_log10() +
  labs(x = "Pollutant Mean",                                                # x axis title
       y = "Actual Pollutants",                                             # y axis title
       title = "Distribution of pollutants across US for the year of 2017") # main title of figure
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 401 rows containing non-finite values (stat_bin).

So apparently from the histogram distribution it looks like the distribution of the pollutants comparatively decreased over the 30-year span. Let’s check some other parameters to validate this point.

Let’s check the variation of the mean value of the pollutants across the dataframes.

Density Plots

# Density of pollutant mean: full dataset, then the 1987 and 2017 subsets.
ggplot(eda1) +
  geom_density(aes(x = arithmetic_mean), fill = "plum", alpha = 0.90) +
  scale_x_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 35945 rows containing non-finite values (stat_density).

ggplot(eda1987) +
  geom_density(aes(x = arithmetic_mean), fill = "steelblue", alpha = 0.90) +
  scale_x_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 317 rows containing non-finite values (stat_density).

ggplot(eda2017) +
  geom_density(aes(x = arithmetic_mean), fill = "salmon", alpha = 0.90) +
  scale_x_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 401 rows containing non-finite values (stat_density).

The distribution of pollutants in the full dataset follows an almost identical pattern to that of 2017. However, for 1987 the density of the pollutant distribution looks smaller. The main difference between 1987 and 2017 is that, at one point, some pollutant densities in 1987 appear higher than those in 2017.

Kernel Density Overlaid on Histogram

# Spanned Over 30 Years: kernel density curve drawn over a density-scaled
# histogram.
ggplot(eda1, aes(arithmetic_mean, ..density..)) +
  geom_histogram(colour = "grey60", fill = "plum", size = .2) +
  geom_density() +
  scale_x_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 35945 rows containing non-finite values (stat_bin).
## Warning: Removed 35945 rows containing non-finite values (stat_density).

# For Year of 1987
# FIX: this chunk previously plotted `eda1` (the full 30-year data) while
# claiming to show 1987 -- the 35945 removed-rows warning below matches the
# full dataset, whereas the 1987 subset removes 317 rows elsewhere in this
# document. Re-knit to refresh the output.
ggplot(eda1987, aes(x=arithmetic_mean, y=..density..)) + 
  geom_histogram(fill="steelblue", color="grey60", size=.2) + 
  geom_density() + 
  scale_x_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## (Stale output from the eda1-based render; expect 317 removed rows on re-knit.)

# For Year of 2017
# FIX: this chunk previously plotted `eda1` (the full 30-year data) while
# claiming to show 2017 -- the 35945 removed-rows warning below matches the
# full dataset, whereas the 2017 subset removes 401 rows elsewhere in this
# document. Re-knit to refresh the output.
ggplot(eda2017, aes(x=arithmetic_mean, y=..density..)) + 
  geom_histogram(fill="salmon", color="grey60", size=.2) + 
  geom_density() + 
  scale_x_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## (Stale output from the eda1-based render; expect 401 removed rows on re-knit.)

Violin Plot

# Violin plots of pollutant mean: full dataset, then 1987 and 2017.
ggplot(eda1, aes(x = "Pollutants", y = arithmetic_mean)) +
  geom_violin(alpha = 0.8, fill = "plum") +
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 35945 rows containing non-finite values (stat_ydensity).

ggplot(eda1987, aes(x = "Pollutants", y = arithmetic_mean)) +
  geom_violin(alpha = 0.8, fill = "steelblue") +
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 317 rows containing non-finite values (stat_ydensity).

ggplot(eda2017, aes(x = "Pollutants", y = arithmetic_mean)) +
  geom_violin(alpha = 0.8, fill = "salmon") +
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 401 rows containing non-finite values (stat_ydensity).

Boxplots

# Boxplots of pollutant mean: full dataset, then 1987 and 2017.
ggplot(eda1, aes(x = 1, y = arithmetic_mean)) +
  geom_boxplot(alpha = 0.2, fill = "plum") +
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 35945 rows containing non-finite values (stat_boxplot).

ggplot(eda1987, aes(x = 1, y = arithmetic_mean)) +
  geom_boxplot(alpha = 0.2, fill = "steelblue") +
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 317 rows containing non-finite values (stat_boxplot).

ggplot(eda2017, aes(x = 1, y = arithmetic_mean)) +
  geom_boxplot(alpha = 0.2, fill = "salmon") +
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 401 rows containing non-finite values (stat_boxplot).

As seen from both the Boxplot it can be said that: For Entire Dataset: The Median of the entire Pollutant mean is located almost exactly in the middle of the Box indicating a uniform distribution of Pollutants across Entire US as collected over the span of 30 years.

For 1987 US Data: The median of the pollutant mean seems to be a little closer to the upper end of the spectrum (i.e. towards the top whisker), indicating a higher amount of pollutant distribution across the entire US during 1987.

For 2017 US Data: The median of the pollutant mean seems to be much closer to the upper end of the spectrum (i.e. towards the top whisker), indicating a higher amount of pollutant distribution across the entire US during 2017. However, the box itself is smaller than for both the entire dataset and the 1987 subset, probably due to fewer data values captured in 2017 compared to 1987.

Let’s try to get a visual comparison of the pollutants data for both the year of 1987 and 2017

# Compare pollutant means for 1987 vs 2017 side by side.
# FIX: colour was mapped to the numeric `year`, producing a continuous
# colour gradient legend for what are really two discrete groups; mapping
# factor(year) yields a proper two-level discrete legend.
ggplot(eda2, aes(x = factor(year), y = arithmetic_mean, color = factor(year))) +
  geom_boxplot() +
  labs(
    x = "Year",                                                   # x axis title
    y = "Pollutant Mean",                                         # y axis title
    title = "Pollutant Distribution Across Year 1987 & 2017",     # main title of figure
    color = "Year"                                                # legend title
) + scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 718 rows containing non-finite values (stat_boxplot).

And we can say conclusively that the median of the pollutant mean sits closer to the top whisker of the box for both 1987 and 2017. This indicates that the median of the pollutant mean is higher in 2017 than in 1987. However, the boxes for 1987 and 2017 differ in size because of variation in the distribution of the collected pollutant means.

Let’s try to check which regions in US we have these pollutants data for.

Google Satellite Map

# Centre a satellite basemap on the mean station coordinates across the
# whole dataset.
# NOTE(review): current ggmap versions require a Google Maps API key
# (ggmap::register_google) for Google map sources -- confirm before
# re-knitting; the URL below is from an earlier, keyless render.
map1 <- get_map(location = c(lon = mean(eda1$longitude), 
                             lat = mean(eda1$latitude)), 
                zoom = 4,
                maptype = "satellite", 
                scale = 2)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.962973,-94.686641&zoom=4&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
#knitr::include_graphics('Documents/Projects/AirQuality/staticmap.png')

# Plot every monitoring-station location over the satellite basemap.
# FIX: fill/alpha are constants, so they belong outside aes(). Mapping them
# inside aes() created a spurious legend keyed to the literal string "red",
# which then had to be hidden with guides(); with the constants moved out,
# no legend is produced and guides() is unnecessary.
ggmap(map1) +
  geom_point(data = eda1,
             aes(x = longitude, y = latitude),
             fill = "red",
             alpha = 0.8,
             size = 5,
             shape = 21)
## Warning: Removed 16732 rows containing missing values (geom_point).

Interestingly for the entire data sets looks like the pollutants data are collected from all over the US.

Let’s try to check the regions covered in the year of 1987.

# Basemap centred on the mean coordinates of the 1987 stations.
map1987 <- get_map(location = c(lon = mean(eda1987$longitude), 
                                lat = mean(eda1987$latitude)), 
                   zoom = 4,
                   maptype = "satellite", 
                   scale = 2)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=38.676913,-92.553185&zoom=4&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
# FIX: constants (fill/alpha) moved outside aes(); inside aes() they
# produced a bogus legend that had to be suppressed with guides().
ggmap(map1987) +
  geom_point(data = eda1987,
             aes(x = longitude, y = latitude),
             fill = "red",
             alpha = 0.8,
             size = 5,
             shape = 21)
## Warning: Removed 574 rows containing missing values (geom_point).

Let’s try to see the same for the year of 2017.

# Basemap centred on the mean coordinates of the 2017 stations.
map2017 <- get_map(location = c(lon = mean(eda2017$longitude), 
                                lat = mean(eda2017$latitude)), 
                   zoom = 4,
                   maptype = "satellite", 
                   scale = 2)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=38.398833,-96.206211&zoom=4&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
# FIX: constants (fill/alpha) moved outside aes(); inside aes() they
# produced a bogus legend that had to be suppressed with guides().
ggmap(map2017) +
  geom_point(data = eda2017,
             aes(x = longitude, y = latitude),
             fill = "red",
             alpha = 0.8,
             size = 5,
             shape = 21)
## Warning: Removed 96 rows containing missing values (geom_point).

Let’s try to visualize the States with the highest mean pollutants distribution.

# Histogram of pollutant mean filled by state.
# FIX: binwidth is a layer parameter, not an aesthetic -- inside aes() it
# was silently ignored (the "Ignoring unknown aesthetics: binwidth" warning
# in the original render) and ggplot fell back to 30 bins. With
# scale_x_log10() the binwidth applies on the log10 scale, so a value in
# log units (0.25 = a quarter decade) is appropriate, not the raw 30000.
ggplot(data = eda1) + 
  geom_histogram(aes(x = arithmetic_mean, fill = state_name),
                 binwidth = 0.25) + 
  scale_x_log10() 
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 35945 rows containing non-finite values (stat_bin).

Okay, from the histogram distribution, it looks like some states such as Alabama, Alaska, and Arizona have a higher concentration of pollutants captured in the data over this 30-year span.

Let’s reuse some of the dataframes for our next level of analysis

# Persist the working data frames for the next analysis stage.
# FIX: row.names = FALSE prevents write.csv from emitting the row-name
# index, which otherwise reappears as a spurious unnamed "X" column when
# these files are read back in.
write.csv(eda1, file = "data/eda1.csv", row.names = FALSE)
write.csv(eda1987, file = "data/eda1987.csv", row.names = FALSE)
write.csv(eda2017, file = "data/eda2017.csv", row.names = FALSE)
write.csv(eda2, file = "data/eda2.csv", row.names = FALSE)   #Entire US with 2017 & 1987

Let’s free some memory so the remaining EDA steps can run conveniently.

# Drop the map objects and working data frames now that they are written to
# disk, freeing memory for the next EDA stage.
rm(map1, map1987, map2017, eda1, eda1987, eda2, eda2017)

Conclusion